Libraries:

library(ggplot2) # ggtree is based on ggplot and uses a lot of its code
library(ggtree) # ggtree
library(treeio) # Loads up Beast trees
library(ggnewscale) # For adding new heatmaps (annotations)
library(scales) # For colours
library(gplots) # Also for colours

Read in files

# Read in beast tree with treeio package
mcc_tree <- treeio::read.beast(mcc_tree_file)
metadata <- read.csv(metadata_file)
beast_clusters <- read.csv(beast_clusters_file, header = F, col.names = c("cluster", "id"))

Have a look at the Beast file

mcc_tree
## 'treedata' S4 object that stored information of
##  'beast_results/PAKISTAN_ALL.mcc.tree'.
## 
## ...@ phylo: 
## Phylogenetic tree with 122 tips and 121 internal nodes.
## 
## Tip labels:
##   ERR3335723_2016, ERR3335724_2016, ERR3335725_2016, ERR3335726_2016, ERR3335727_2016, ERR3335728_2016, ...
## 
## Rooted; includes branch lengths.
## 
## with the following features available:
##  'CAheight_0.95_HPD',    'CAheight_mean',    'CAheight_median',  'CAheight_range',
##  'height',   'height_0.95_HPD',  'height_median',    'height_range', 'length',
##  'length_0.95_HPD',  'length_median',    'length_range', 'posterior'.

There's lots of metadata here that can be used in the trees (the "with the following features available:" bit)

Set up tree metadata - first and last dates etc

# I think this is how you get the first (estimated) date of the Beast tree - i.e. the root. 
# My root year happens to be 0, but if the root date was different, then I suspect the root.edge would be different 
# (i.e. root.edge is not actually in years)
first_date <- mcc_tree@phylo$root.edge
# Get the last overall date from the sample names - year after the last underscore
last_date <- as.numeric(max(unlist(lapply(strsplit(mcc_tree@phylo$tip.label, "_"), 
                                             function(x){x[length(x)]}))))
# In ggtree you have to give the last date as a character string in the form of YYYY-MM-DD
last_date_chr <- paste0(as.character(last_date), "-12-31")
# Total n samples
n_samps <- length(mcc_tree@phylo$tip.label)

Trees

Set up params/values for the trees:

# Expand the x-axis so there's a bit of room at the start and a very large space (80% of the full timeline) 
# to the right for the annotations
x_lim <- c(first_date-10, last_date+(last_date*0.8))
# Expand the y-axis so there's room at the top and bottom
y_lim <- c(-5, n_samps + (n_samps * 0.1))
# Define vertical line colour
v_line_col <- "red"
# Define sequence of year ticks on the x-axis
x_labs_full <- seq(first_date, last_date, by = 100)
# Angle of the labels so they fit in
angle <- 45

Draw basic tree

ggtree(mcc_tree, mrsd = last_date_chr) 

Add date scale

ggtree(mcc_tree, mrsd = last_date_chr) +
  theme_tree2() 

Add annotations

Code for the tip labels is shown, but commented

ggtree(mcc_tree, mrsd = last_date_chr) + 
  theme_tree2() +
  # Tip labels
  # geom_tiplab(align=TRUE, linetype='dashed', linesize=.3, size = 2) +
  # Red confidence bars
  geom_range("length_0.95_HPD", color='red', size=2, alpha=.5) +
  # Text labels for branch posterior scores
  # Note - you can filter the values - here only 0.9 and above are shown.
  # Also can control the position with the vjust arg. 
  geom_text2(aes(label=round(as.numeric(posterior), 2),
                 subset=as.numeric(posterior)> 0.9,
                 x=branch), vjust=0) +
  # Set the y-axis as defined above
  coord_cartesian(ylim = y_lim) +
  # Set the x-axis limits, and year tick labels (don't know why you need 'breaks' as well as 'labels')
  scale_x_continuous(limits = x_lim, breaks = x_labs_full, labels = x_labs_full)+
  # Set the appearance of the year labels, including angle
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))

Add the metadata annotations to the big space on the right

Setup

Prep data for heatmaps (strips)

# Add id/year hybrid col so samples names are correct - i.e. they match the Beast tree sample names
metadata$id_year <- paste0(metadata$wgs_id, "_", metadata$year)
rownames(metadata) <- metadata$id_year

# Subset metadata 
# FORMAT: one col per attribute, rownames of dataframe match names of samples in the tree
lin_data <- metadata[, "main_lineage", drop = F]
dr_status_data <- metadata[, "dr_status", drop = F]

# Convert lin and DR data to factors
lin_data <- data.frame(apply(lin_data, 2, as.factor))

# Change column names for neatness on tree
colnames(dr_status_data) <- "DR status"
colnames(lin_data) <- "Lineage"

Drug resistance data now looks like this. Note - the sample names are the rownames of the one-column dataframe. Also note - some sample names are "XXXX_NA". This is because they've been pulled from the metadata and not all samples in the metadata have years assigned to them and therefore weren't used to create the tree. ggtree will find the ones that are in both the metadata and the tree, so it doesn't matter if you have extra samples in your metadata.

head(dr_status_data)
##               DR status
## ERR688030_NA        XDR
## ERR688043_NA        XDR
## ERR1213917_NA Sensitive
## ERR2510632_NA   Pre-MDR
## ERR2510285_NA       MDR
## ERR2510586_NA       MDR

Lineage data is very similar:

head(lin_data)
##               Lineage
## ERR688030_NA        4
## ERR688043_NA        4
## ERR1213917_NA       3
## ERR2510632_NA       3
## ERR2510285_NA       3
## ERR2510586_NA       3

Colours

# Define cols for each dataset - one colour per unique value of the datasets
alpha <- 0.9
lin_colours <- rainbow(length(unique(metadata$main_lineage)), alpha = alpha)
dr_status_colours <- scales::alpha(gplots::col2hex(c("green1", "yellow2", "orange1", "red1", "black", "grey")), alpha = alpha)

# Add names (unique values for each dataset) to dataframes. 
names(lin_colours) <- c(sort(unique(metadata$main_lineage)))
names(dr_status_colours) <- c("Sensitive", "Pre-MDR", "MDR", "Pre-XDR", "XDR", "Other")

Colour vectors:

lin_colours
##           1           2           3           4 
## "#FF0000E6" "#80FF00E6" "#00FFFFE6" "#8000FFE6"
dr_status_colours
##   Sensitive     Pre-MDR         MDR     Pre-XDR         XDR       Other 
## "#00FF00E6" "#EEEE00E6" "#FFA500E6" "#FF0000E6" "#000000E6" "#BEBEBEE6"

Parameters for the strip annotations

# Where the annotations are placed (in this case 50 years to the right of the last date)
offset <- 50
# Width of the heatmap strip
width <- 0.05

To add the heatmap/strip annotation, draw the same tree as above and save as an object, then pass to gheatmap()

gg_mcmc_tree <- ggtree(mcc_tree, mrsd = last_date_chr)+
  theme_tree2() +
  # geom_tiplab(align=TRUE, linetype='dashed', linesize=.3, size = 2) + 
  geom_range("length_0.95_HPD", color='red', size=2, alpha=.5) +
  geom_text2(aes(label=round(as.numeric(posterior), 2),
                 subset=as.numeric(posterior)> 0.9,
                 x=branch), vjust=0) +
  coord_cartesian(ylim = y_lim) +
  scale_x_continuous(breaks = x_labs_full, labels = x_labs_full, limits = x_lim)+
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))

# Add lineage data 
lin_hm <- gheatmap(gg_mcmc_tree, lin_data,
                   width = width,
                   offset = offset, 
                   # Not sure what the color arg does or why it has to be NULL. Rest of the args are self-explanatory
                   color = NULL,
                   colnames_position = "top",
                   colnames_angle = angle, 
                   colnames_offset_y = 1,
                   hjust = 0,
                   font.size = 3) +
  # Add the custom colours defined above
  scale_fill_manual(values = lin_colours, breaks = names(lin_colours) )+
  # Define the legend title
  labs(fill = "Lineage")

lin_hm

Note - the scale_fill_manual() function took ages to work out - the values are the colours themseves, i.e. #FF0000E6, #80FF00E6, #00FFFFE6, #8000FFE6, and the breaks is the unique values in the data (i.e. 1, 2, 3, 4), which we've used to name the colour vector.

Add another strip (heatmap) - DR data

To do this we have to add the saved tree to the new_scale_fill() function from the ggnewscale package # See "7.3.1 Visualize tree with multiple associated matrix" - https://yulab-smu.top/treedata-book/chapter7.html

It is possible to just have a dataset with two columns (i.e. in this case the lineage and DR data) and to concattenate the colours together, but this would mean there's only one legend and the whole thing is less clear.

lin_hm <- lin_hm + ggnewscale::new_scale_fill() 

Note - offset has to be increased otherwise the strip will be drawn over the top of the previous one. The rest of the code is the same except for the tree input, data input, colours and legend label

dr_status_hm <- gheatmap(lin_hm, dr_status_data,
                         width = width,
                         # Increase offset
                         offset = offset+100, 
                         color = NULL,
                         colnames_position = "top",
                         colnames_angle = angle, 
                         colnames_offset_y = 1,
                         hjust = 0,
                         font.size = 3) +
  # Add colours for DR status
  scale_fill_manual(values = dr_status_colours, breaks = names(dr_status_colours) )+
  # New legend label
  labs(fill = "DR\nstatus")

dr_status_hm

Add binary dataset

The last annotation to add is a binary dataset of individual drugs - 0/1 whether the sample is resistant to the drug

Get data and convert numeric 0/1 to factors

dr_data <- dplyr::select(metadata, rifampicin:delamanid)
dr_data <- data.frame(apply(dr_data, 2, as.factor))

Lots of columns with 0/1/NA

head(dr_data)
##               rifampicin isoniazid ethambutol pyrazinamide streptomycin
## ERR688030_NA        <NA>      <NA>       <NA>         <NA>         <NA>
## ERR688043_NA        <NA>      <NA>       <NA>         <NA>         <NA>
## ERR1213917_NA          0         0          1         <NA>            0
## ERR2510632_NA          0         1          0            0         <NA>
## ERR2510285_NA          1         1          0            1         <NA>
## ERR2510586_NA          1         1          0            1         <NA>
##               ofloxacin moxifloxacin levofloxacin amikacin kanamycin
## ERR688030_NA       <NA>         <NA>         <NA>     <NA>      <NA>
## ERR688043_NA       <NA>         <NA>         <NA>     <NA>      <NA>
## ERR1213917_NA      <NA>         <NA>         <NA>     <NA>      <NA>
## ERR2510632_NA      <NA>         <NA>         <NA>     <NA>      <NA>
## ERR2510285_NA      <NA>         <NA>         <NA>     <NA>      <NA>
## ERR2510586_NA      <NA>         <NA>         <NA>     <NA>      <NA>
##               capreomycin ciprofloxacin prothionamide ethionamide
## ERR688030_NA         <NA>          <NA>          <NA>        <NA>
## ERR688043_NA         <NA>          <NA>          <NA>        <NA>
## ERR1213917_NA        <NA>          <NA>          <NA>        <NA>
## ERR2510632_NA        <NA>          <NA>          <NA>        <NA>
## ERR2510285_NA        <NA>          <NA>          <NA>        <NA>
## ERR2510586_NA        <NA>          <NA>          <NA>        <NA>
##               clarithromycin clofazimine bedaquiline cycloserine linezolid
## ERR688030_NA            <NA>        <NA>        <NA>        <NA>      <NA>
## ERR688043_NA            <NA>        <NA>        <NA>        <NA>      <NA>
## ERR1213917_NA           <NA>        <NA>        <NA>        <NA>      <NA>
## ERR2510632_NA           <NA>        <NA>        <NA>        <NA>      <NA>
## ERR2510285_NA           <NA>        <NA>        <NA>        <NA>      <NA>
## ERR2510586_NA           <NA>        <NA>        <NA>        <NA>      <NA>
##               para_aminosalicylic_acid rifabutin delamanid
## ERR688030_NA                      <NA>      <NA>      <NA>
## ERR688043_NA                      <NA>      <NA>      <NA>
## ERR1213917_NA                     <NA>      <NA>      <NA>
## ERR2510632_NA                     <NA>      <NA>      <NA>
## ERR2510285_NA                     <NA>      <NA>      <NA>
## ERR2510586_NA                     <NA>      <NA>      <NA>

Add the tree to new_scale_fill() again:

dr_status_hm <- dr_status_hm + ggnewscale::new_scale_fill() 

This time color is changed to "black" I think as the 'base' colour, and low and high added. I think these assign the 0 value to white and the 1 to black. NA is defined in the scale_fill_manual() function.

This time in scale_fill_manual(), instead of breaks, for some reason labels is used to map the 0/1 to the legend labels.

gheatmap(dr_status_hm, dr_data,
         # Increase offset
         offset = offset+200,
         width = width+0.5,
         # Change color to black
         # color = NULL,
         color="black",
         low="white", 
         high="black", 
         colnames_position = "top",
         colnames_angle = angle, 
         colnames_offset_y = 1,
         hjust = 0,
         font.size = 2.5) +
  # Define colours
  scale_fill_manual(values=c("white", "black"), labels = c("Sensitive", "Resistant", "NA"), na.value = "grey")+
  labs(fill = "Drug\nresistance")

The drug binary data looks a bit squashed on this webpage but I'll show how to save to pdf later.

Actually the whole tree is a bit squashed at the tips because the samples were taken from possible transmission cases, but are also from different lineages so have MRCA a long time ago. So it might be useful to zoom in on the tips.

Zoom in to the tree

If we specify a year to limit the x-axis, then only those clades that have MRCA after this year are drawn, which might be useful for zooming in on clusters.

Zoom tree setup

The code is basically the same, but the x-axis limits are changed. Note - the offset and width have to be much smaller.

I've also added a vertical red line to show when the last date was.

zoom_range <- c((last_date+1) - 50, (last_date + 1) + 50)
zoom_range_seq <- seq(zoom_range[1], (last_date+1), 2)

offset_zoom <- 0
width_zoom <- 0.001
gg_mcmc_tree_zoom <- ggtree(mcc_tree, mrsd = last_date_chr) +
  theme_tree2() +
  coord_cartesian(ylim = y_lim) +
  scale_x_continuous(breaks = zoom_range_seq,
                     labels = zoom_range_seq,
                     limits = zoom_range) +
  # Add vertical line
  geom_vline(aes(xintercept = 2020), col = "red")+
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))

# Add lin data 
lin_hm <- gheatmap(gg_mcmc_tree_zoom, lin_data,
                   width = width_zoom,
                   offset = offset_zoom, 
                   color = NULL,
                   colnames_position = "top",
                   colnames_angle = angle, colnames_offset_y = 1,
                   hjust = 0,
                   font.size = 3) +
  scale_fill_manual(values = lin_colours, breaks = names(lin_colours) )+
  labs(fill = "Lineage")

lin_hm <- lin_hm + ggnewscale::new_scale_fill() 

# Add DR status
dr_status_hm <- gheatmap(lin_hm, dr_status_data,
                         width = width_zoom,
                         offset = offset_zoom + 2, 
                         color = NULL,
                         colnames_position = "top",
                         colnames_angle = angle, colnames_offset_y = 1,
                         hjust = 0,
                         font.size = 3) +
  scale_fill_manual(values = dr_status_colours, breaks = names(dr_status_colours) )+
  labs(fill = "DR\nstatus")

# Add DR individual status
dr_status_hm <- dr_status_hm + ggnewscale::new_scale_fill() 

gheatmap(dr_status_hm, dr_data,
         offset = offset_zoom + 4,
         width = width_zoom + 0.02,
         # color = NULL,
         low="white", high="black", color="black",
         colnames_position = "top",
         colnames_angle = angle, 
         colnames_offset_y = 1,
         hjust = 0,
         font.size = 2.5, 
         legend_title = "llw") +
  scale_fill_manual(values=c("white", "black"), labels = c("Sensitive", "Resistant", "NA"), na.value = "grey")+
  labs(fill = "Drug\nresistance")

Loop over data to plot individual clusters

If we want to plot these individual clusters it's possible to loop over lists of samples, subset the tree, and run the ggtree code

This is the list of samples and their groups - all the clusters that result from cutting the tree at 50 years before the last sample. This file was created with my cut_tree.py function.

head(beast_clusters)
##   cluster                   id
## 1       1      ERR3335724_2016
## 2       1      ERR3335759_2016
## 3       2      ERR3335726_2016
## 4       2      ERR3335728_2016
## 5       2 taj_pakistan_05_2017
## 6       2 taj_pakistan_11_2017

Split the file into each cluster to loop over:

clusters_split <- split(beast_clusters, beast_clusters$cluster)
head(clusters_split)
## $`1`
##   cluster              id
## 1       1 ERR3335724_2016
## 2       1 ERR3335759_2016
## 
## $`2`
##   cluster                   id
## 3       2      ERR3335726_2016
## 4       2      ERR3335728_2016
## 5       2 taj_pakistan_05_2017
## 6       2 taj_pakistan_11_2017
## 7       2 taj_pakistan_57_2019
## 
## $`3`
##    cluster                   id
## 8        3      ERR3335789_2017
## 9        3      ERR3335792_2017
## 10       3      ERR3335799_2017
## 11       3 taj_pakistan_09_2017
## 12       3 taj_pakistan_44_2017
## 
## $`4`
##    cluster              id
## 13       4 ERR3335765_2016
## 14       4 ERR3335778_2016
## 15       4 ERR3335782_2017
## 
## $`5`
##    cluster              id
## 16       5 ERR3335752_2016
## 17       5 ERR3335797_2017
## 
## $`6`
##    cluster              id
## 18       6 ERR3335756_2016
## 19       6 ERR3335764_2016
## 20       6 ERR3335758_2016

Notes:

  • Usually I'd save these as a pdf (one plot per page, and one page per loop) - the relevent code is pdf(file = <filename>) at the start of the loop to open the pdf file, print(<plot_name>) during the loop, and dev.off() at the end to finish saving it. These lines are commented here, except the print(<plot_name>) so I can show the plots here.

  • There's only one function in treeio to subset trees, which is treeio::drop.tip, so I have to first get the samples that aren't in the cluster of interest with %in%

  • I use a new way here of getting first and last dates by creating the basic tree first - the theme_tree2() obviously does some calculations on the edge lengths to get the dates, but I haven't pulled out the function, rather I just run it and get the dates.

  • Here I program the x-axis scale and labels according to the order of magnitude (oom) of the date range. If I just leave it to the defaults it's a bit a of a mess. The log10_ceiling() function determines the oom.

  • I also have to determine the x-axis ranges and the space to the right of the tree for the annotations by taking proportions of the date range - so if a cluster's MRCA stretches back 50 years compared to 5 years, the spaces can get too squashed or stretched, so evserything's done as percentages of the time span. This also applies to the offsets and widths, which are dependent on the x-axis.

  • Similarly, I also have to change the y-axis space because the number of samples in each cluster changes a lot.

# Get sample names from whole tree
mcc_tree_tips <- mcc_tree@phylo$tip.label

# pdf(file = beast_clusters_pdf_file)
for(i in seq(clusters_split)){
  
  # Get the names of samples from cluster i
  names_to_keep <- clusters_split[[i]]$id
  # Get the names from the full Beast tree that aren't these names
  names_to_drop <- mcc_tree_tips[!(mcc_tree_tips %in% names_to_keep)]
  # Create new sub tree 
  clust_tree <- treeio::drop.tip(mcc_tree, names_to_drop)
  
  # Do tree first to get first and last date and ranges
  last_date_clust <- as.numeric(max(unlist(lapply(strsplit(clust_tree@phylo$tip.label, "_"), function(x){x[length(x)]}))))
  last_date_clust_chr <- paste0(last_date_clust-1, "-12-31")
  gg_clust_tree <- ggtree(clust_tree, mrsd = last_date_clust_chr) +
    theme_tree2()
  last_date_clust <- ceiling(max(gg_clust_tree$data$x))
  first_date_clust <- floor(min(gg_clust_tree$data$x))
  date_range_clust <- c(first_date_clust, last_date_clust)
  year_span_clust <- diff(date_range_clust)
  
  # N samples in cluster i
  n_samps <- length(clust_tree@phylo$tip.label)
  
  # Get order of magnitude of the date range
  oom <- log10_ceiling(year_span_clust)
  if(oom == 1){
    by <- 1
  }else if(oom == 10){
    by <- 2
  }else if(oom == 100){
    by <- 10
  }else{
    by <- 100
  }
  # Create seq of dates for x-axis
  zoom_range_seq_clust <- seq(first_date_clust,
                              last_date_clust,
                              by = by)
  
  # Determine space to right of tree for annotations as a multiple of total time range
  post_tree_multiply <- 5
  post_tree_span <- (year_span_clust * post_tree_multiply)
  x_lim_clust <- c(first_date_clust - (year_span_clust*0.2), last_date_clust + post_tree_span)
  # Same for y-axis according to number of samples
  y_lim_clust <- c(0 - floor(n_samps*0.1),  n_samps + ceiling(n_samps*0.2) )
  # Sort out the offsets and widths
  width_clust <- 0.035 * post_tree_multiply  # % of tree time span
  offset_clust <- (width_clust * year_span_clust)
  
  # Make tree
  gg_clust_tree <- ggtree(clust_tree, mrsd = last_date_clust_chr) %<+% metadata +
    theme_tree2() +
    coord_cartesian(ylim = y_lim_clust) +
    scale_x_continuous(breaks = zoom_range_seq_clust,
                       labels = zoom_range_seq_clust,
                       limits = x_lim_clust) +
    geom_vline(aes(xintercept = last_date_clust), col = "red")+
    theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))
  
  # Add lin data
  lin_hm_clust <- gheatmap(gg_clust_tree, lin_data,
                           width = width_clust,
                           offset = 0,
                           color = NULL,
                           colnames_position = "top",
                           colnames_angle = angle,
                           # colnames_offset_y = 1,
                           hjust = 0,
                           font.size = 3) +
    scale_fill_manual(values = lin_colours, breaks = names(lin_colours) )+
    labs(fill = "Lineage")
  
  lin_hm_clust <- lin_hm_clust + ggnewscale::new_scale_fill()
  
  # Add DR status clusters
  dr_status_hm_clust <- gheatmap(lin_hm_clust, dr_status_data,
                                 width = width_clust,
                                 offset = offset_clust,
                                 color = NULL,
                                 colnames_position = "top",
                                 colnames_angle = angle,
                                 # colnames_offset_y = 1,
                                 hjust = 0,
                                 font.size = 3) +
    scale_fill_manual(values = dr_status_colours, breaks = names(dr_status_colours) )+
    labs(fill = "DR\nstatus")
  
  dr_status_hm_clust <- dr_status_hm_clust + ggnewscale::new_scale_fill()
  
  final_plot <- gheatmap(dr_status_hm_clust, dr_data,
                         width = width_clust * ncol(dr_data),
                         # offset = ceiling(offset_clust * 3),
                         offset = (offset_clust * 2) + (offset_clust * 0.2),
                         # color = NULL,
                         low="white", high="black", color="black",
                         colnames_position = "top",
                         colnames_angle = angle,
                         colnames_offset_y = 0,
                         hjust = 0,
                         font.size = 2.5,
                         legend_title = "llw") +
    scale_fill_manual(values=c("white", "black"), labels = c("Sensitive", "Resistant", "NA"), na.value = "grey")+
    labs(fill = "Drug\nresistance")
  
  print(final_plot)
  
}

# dev.off()